suppressPackageStartupMessages({
library(Matrix) # Required to work with sparse matrices
library(dplyr)
library(Seurat)
library(patchwork)
library(future) # For parallelization
library(stringr)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
})
#Set/Create Working Directory to Folder
wd <- "/raida/ayaka.mori/integrative_dp/rna_preprocessing/preprocessing_output_simple"
dir.create(wd, showWarnings = FALSE, recursive = TRUE)
knitr::opts_knit$set(root.dir = wd)
scriptPath <- "/raida/ayaka.mori/integrative_dp/script"
source(paste0(scriptPath, "/matrix_helpers.R"))
source(paste0(scriptPath, "/plotting_config.R"))
source(paste0(scriptPath, "/seurat_helpers.R"))
source(paste0(scriptPath, "/misc_helpers.R"))
source(paste0(scriptPath, "/sample_metadata.R"))
plotDir <- paste0(wd,"/raw_qc")
dir.create(plotDir, showWarnings = FALSE, recursive = TRUE)
# Load each of the scalp datasets
raw_data_dir <- "/raida/ayaka.mori/integrative_dp/scRNA-seq_cellranger"
data_dirs <- list.dirs(path=raw_data_dir, full.names=TRUE, recursive=FALSE)
names(data_dirs) <- str_extract(data_dirs, "(?<=/)[^/]*$")
objs <- list()
for(ix in seq_along(data_dirs)){
sample <- names(data_dirs)[ix]
path <- data_dirs[ix]
message(sprintf("Reading in data from sample %s...", sample))
data <- Read10X(data.dir=path)
obj <- CreateSeuratObject(counts=data, project=sample, min.cells=0, min.features=200) # orig.ident not getting assigned correctly?
obj$orig.ident <- sample
objs[[sample]] <- obj
}
## Reading in data from sample GSM6532919_AA2...
## Reading in data from sample GSM6532920_AA4...
## Reading in data from sample GSM6532921_AA7...
## Reading in data from sample GSM6532922_AA8...
## Reading in data from sample GSM6532923_C_PB1...
## Reading in data from sample GSM6532924_C_PB2...
## Reading in data from sample GSM6532925_C_PB3...
## Reading in data from sample GSM6532926_C_SD1...
## Reading in data from sample GSM6532927_C_SD2...
## Reading in data from sample GSM6532928_C_SD3...
objNames <- sapply(objs, function(x) x@project.name)
# Merge individual Seurat objects
obj <- merge(x=objs[[1]], y=objs[2:length(objs)], add.cell.ids=objNames, project="scalp")
# Identify genes we want to blacklist during clustering
rawCounts <- GetAssayData(object=obj, slot="counts")
# mitochondrial:
mt.genes <- grep(pattern = "^MT-", x = rownames(rawCounts), value = TRUE)
# Cell cycle (These are loaded by Seurat)
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
# X/Y chromosome genes:
txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
geneGR <- GenomicFeatures::genes(txdb)
## 1690 genes were dropped because they have exons located on both strands
## of the same reference sequence or on more than one reference sequence,
## so cannot be represented by a single genomic range.
## Use 'single.strand.genes.only=FALSE' to get all the genes in a
## GRangesList object, or use suppressMessages() to suppress this message.
sexGenesGR <- geneGR[seqnames(geneGR) %in% c("chrY", "chrX")]
matchedGeneSymbols <- AnnotationDbi::select(org.Hs.eg.db,
keys = sexGenesGR$gene_id,
columns = c("ENTREZID", "SYMBOL"),
keytype = "ENTREZID")
## 'select()' returned 1:1 mapping between keys and columns
sexChr.genes <- matchedGeneSymbols$SYMBOL
# Genes to ignore (just for clustering purposes)
blacklist.genes <- c(
mt.genes,
sexChr.genes,
s.genes,
g2m.genes
)
# Add cell cycle information now (Using Seurat):
obj <- CellCycleScoring(obj, s.features=s.genes, g2m.features=g2m.genes, set.ident=FALSE)
## Warning: The following features are not present in the object: MLF1IP, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: FAM64A, HN1, not
## searching for symbol synonyms
obj[["percent.mt"]] <- PercentageFeatureSet(obj, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(obj, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3, pt.size = 0.1)

#Visualize feature-feature relationships
plot1 <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

#Filter cells based on QC metrics
obj <- subset(obj, subset = nFeature_RNA > 200 & nFeature_RNA < 7000 & percent.mt < 10)
#Visualize feature-feature relationships
plot1 <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(obj, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

#Normalizing the data
obj <- NormalizeData(obj)
#SCTransform(obj, vars.to.regress = "percent.mt", verbose = FALSE)
obj <- SCTransform(obj, vars.to.regress = "percent.mt", verbose = FALSE)
obj <- RunPCA(obj, features = VariableFeatures(object = obj))
## PC_ 1
## Positive: HLA-DRA, CD74, HLA-DPA1, HLA-DRB1, HLA-DPB1, HLA-DQA1, LYZ, HLA-DQB1, SRGN, CXCL8
## GPR183, IL1B, FTH1, G0S2, HLA-DRB5, SAT1, C15orf48, CD83, TYROBP, CPVL
## FCER1G, BCL2A1, RGS1, FCER1A, AIF1, PLEK, TMSB4X, SERPINB9, IL1R2, INSIG1
## Negative: DCN, CFD, APOD, COL1A2, MGP, PTGDS, COL1A1, COL3A1, CCDC80, CXCL14
## C11orf96, CFH, LUM, COL6A2, FBLN1, APOE, POSTN, TNFAIP6, COL6A1, GSN
## CXCL12, IGFBP5, SPARC, C1R, CTSK, MEG3, CCL2, CXCL1, C1S, SFRP2
## PC_ 2
## Positive: DCN, CFD, HLA-DRA, APOD, CD74, HLA-DPA1, HLA-DRB1, HLA-DPB1, COL1A2, HLA-DQA1
## PTGDS, CST3, MGP, CCDC80, COL3A1, COL1A1, LYZ, C11orf96, HLA-DQB1, CFH
## CXCL8, FTH1, VIM, G0S2, IL1B, GSN, HLA-DRB5, LUM, GPR183, TNFAIP6
## Negative: KRT10, KRT14, KRT1, S100A2, DMKN, SFN, S100A8, KRT5, KRTDAP, S100A9
## PERP, S100A7, LY6D, KRT17, AQP3, SBSN, DSP, KRT16, S100A14, SERPINB2
## KRT6A, LYPD3, FABP5, FGFBP1, LGALS7B, TACSTD2, KRT15, DSC3, SERPINB5, KRT6B
## PC_ 3
## Positive: DCN, CFD, KRT10, KRT1, APOD, HLA-DRA, KRT14, COL1A2, CXCL14, DMKN
## S100A8, HLA-DPA1, PTGDS, HLA-DPB1, S100A9, HLA-DQA1, KRTDAP, CCDC80, CST3, S100A2
## LYZ, COL1A1, SFN, COL3A1, HLA-DRB1, CXCL8, MGP, CD74, S100A7, KRT5
## Negative: SELE, ACKR1, TM4SF1, TAGLN, SPARCL1, IL6, ACTA2, STC1, C11orf96, ADAMTS1
## TPM2, MYL9, C2CD4B, MYH11, CSF3, ADIRF, ADAMTS9, IGFBP7, AVPR1A, AQP1
## IL7R, RGS5, PTPRC, MT1A, MCAM, CXCR4, FABP4, MCTP1, GJA4, PECAM1
## PC_ 4
## Positive: PTPRC, IL7R, CXCR4, SRGN, CD69, CCL5, IL32, CD2, TRBC2, TSC22D3
## LTB, RGS1, TXNIP, TRAC, CLEC2D, ZFP36L2, TRBC1, CD52, CD3D, FYB1
## STK4, KLRB1, CYTIP, SPOCK2, GZMK, ARHGDIB, ALOX5AP, RHOH, DUSP4, TRAT1
## Negative: TM4SF1, SELE, ACKR1, IL6, SPARCL1, C11orf96, STC1, TAGLN, HLA-DRA, ADAMTS1
## CSF3, C2CD4B, CD74, ACTA2, IGFBP7, G0S2, HLA-DRB1, ADAMTS9, S100A8, IFI27
## KRT10, HLA-DPA1, TPM2, ADIRF, MYL9, CCL2, S100A9, AQP1, MYH11, KRT1
## PC_ 5
## Positive: TAGLN, C11orf96, ACTA2, TPM2, MYL9, MYH11, AVPR1A, MT1A, RGS5, GJA4
## CYCS, CCL2, KCNE4, SORBS2, CALD1, CRISPLD2, RRAD, RERGL, MYLK, ADRA2A
## TPM1, ADIRF, MCAM, PLN, MT1M, RHOB, CNN1, ZNF331, PDGFA, SYNM
## Negative: SELE, ACKR1, TM4SF1, STC1, CSF3, C2CD4B, AQP1, IFI27, MCTP1, SPRY1
## FLT1, PECAM1, PLVAP, ADAMTS9, SOX17, IL6, CLDN5, DCN, ZNF385D, CALCRL
## CLU, ADGRL4, CFD, GNG11, TSC22D1, EMP1, EMCN, VWF, RCAN1, VCAM1
ElbowPlot(obj, ndims = 50)

# Store sample information in metadata
obj$Sample <- obj$orig.ident
# Store disease status in metadata
obj$diseaseStatus <- NA
obj$diseaseStatus <- ifelse(grepl("C_SD", obj$Sample), "C_SD", obj$diseaseStatus)
obj$diseaseStatus <- ifelse(grepl("C_PB", obj$Sample), "C_PB", obj$diseaseStatus)
obj$diseaseStatus <- ifelse(grepl("AA", obj$Sample), "AA", obj$diseaseStatus)
library(harmony)
obj <- obj %>% RunHarmony("Sample", plot_convergence = TRUE)
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony 5/10
## Harmony 6/10
## Harmony 7/10
## Harmony 8/10
## Harmony 9/10
## Harmony 10/10
## Warning: Invalid name supplied, making object name syntactically valid. New
## object name is Seurat..ProjectDim.SCT.harmony; see ?make.names for more details
## on syntax validity

VlnPlot(obj, features = "PC_1", group.by = "Sample", pt.size = 0)

DimPlot(obj, reduction = "harmony", pt.size = 0.1, group.by = "Sample")

DimPlot(obj, reduction = "harmony", pt.size = 0.1, group.by = "diseaseStatus")

ElbowPlot(obj, ndims = 50, reduction = "harmony")

obj <- RunUMAP(obj, reduction = "harmony", dims = 1:20, n.neighbors = 50L, min.dist = 0.5)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:36:37 UMAP embedding parameters a = 0.583 b = 1.334
## 14:36:37 Read 52465 rows and found 20 numeric columns
## 14:36:37 Using Annoy for neighbor search, n_neighbors = 50
## 14:36:37 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:36:42 Writing NN index file to temp file /tmp/RtmpQV9Mzq/fileeb2a591db1af
## 14:36:42 Searching Annoy index using 1 thread, search_k = 5000
## 14:37:08 Annoy recall = 100%
## 14:37:09 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 50
## 14:37:12 Initializing from normalized Laplacian + noise (using irlba)
## 14:37:19 Commencing optimization for 200 epochs, with 3657800 positive edges
## 14:37:45 Optimization finished
obj <- FindNeighbors(obj, dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
obj <- FindClusters(obj, resolution = 0.8)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 52465
## Number of edges: 1666475
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9189
## Number of communities: 24
## Elapsed time: 8 seconds
# Store cluster information in metadata
obj$Clusters <- Idents(obj)
sample_colors <- c("GSM6532919_AA2" = "#CAB2D6", "GSM6532920_AA4" = "#B2DF8A", "GSM6532921_AA7" = "#FDBF6F",
"GSM6532922_AA8" = "#A6CCC3", "GSM6532923_C_PB1" = "#E31A1C",
"GSM6532924_C_PB2" = "#33A02C", "GSM6532925_C_PB3" = "#FB9A99", "GSM6532926_C_SD1" = "#FF7F00",
"GSM6532927_C_SD2" = "#CCFFFF", "GSM6532928_C_SD3" = "#000000")
DimPlot(obj,reduction="umap", group.by = "Sample", cols = sample_colors)

DimPlot(obj,reduction="umap", group.by = "Clusters", label = TRUE)

sample_cmap <- readRDS("/raida/ayaka.mori/integrative_dp/script/sample_cmap.rds")
sample_cmap <- sample_cmap[names(sample_cmap) %in% unique(obj$Sample)]
disease_cmap <- head(cmaps_BOR$stallion, 3)
names(disease_cmap) <- c("AA", "C_SD", "C_PB")
# Plot clustering results:
plotDir <- paste0(wd,"/clustering_qc")
dir.create(plotDir, showWarnings = FALSE, recursive = TRUE)
plotClusterQC(obj, subgroup="preclustered", plotDir=plotDir, pointSize=0.2, sampleCmap=sample_cmap, diseaseCmap=disease_cmap)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'xgroup'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'xgroup'. You can override using the
## `.groups` argument.
## Warning: The `size` argument of `element_rect()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## png
## 2
# Save clustered object here:
saveRDS(obj, file = paste0(wd, "/preclustered.rds"))
# annotation
RidgePlot(obj, features = c("KRT14", "KRT5", "KRT15", "KRT10", "SOX9")) #Keratinocytes
## Picking joint bandwidth of 0.129
## Picking joint bandwidth of 0.0875
## Picking joint bandwidth of 0.0449
## Picking joint bandwidth of 0.129
## Picking joint bandwidth of 0.00392

RidgePlot(obj, features = c("CLEC9A", "XCR1")) #Myeloid(CLEC9a.DC)
## Picking joint bandwidth of 2.14e-06
## Picking joint bandwidth of 2.11e-06

RidgePlot(obj, features = c("CLEC9A")) #Myeloid(CLEC9a.DC)
## Picking joint bandwidth of 2.14e-06

RidgePlot(obj, features = c("XCR1")) #Myeloid(CLEC9a.DC)
## Picking joint bandwidth of 2.11e-06

RidgePlot(obj, features = c("CD1A")) #Myeloid(DCs_1)
## Picking joint bandwidth of 0.00386

RidgePlot(obj, features = c("CD14", "CD163")) #Myeloid(Macs_1)
## Picking joint bandwidth of 0.00682
## Picking joint bandwidth of 0.00595

RidgePlot(obj, features = c("CD14")) #Myeloid(Macs_1)
## Picking joint bandwidth of 0.00682

RidgePlot(obj, features = c("CD163")) #Myeloid(Macs_1)
## Picking joint bandwidth of 0.00595

RidgePlot(obj, features = c("CCL19", "CD200")) #Myeloid(M1_Macs)
## Picking joint bandwidth of 0.0285
## Picking joint bandwidth of 0.0175

RidgePlot(obj, features = c("CCL19")) #Myeloid(M1_Macs)
## Picking joint bandwidth of 0.0285

RidgePlot(obj, features = c("CD200")) #Myeloid(M1_Macs)
## Picking joint bandwidth of 0.0175

RidgePlot(obj, features = c("CD79A")) #Myeloid(Plasma)
## Picking joint bandwidth of 0.0217

RidgePlot(obj, features = c("CD3D", "IKZF2", "CCL5", "CD8A"))#Tcells
## Picking joint bandwidth of 0.0168
## Picking joint bandwidth of 2.3e-06
## Picking joint bandwidth of 0.00566
## Picking joint bandwidth of 0.00591

RidgePlot(obj, features = c("KIT", "HPGD", "TPSB2")) #Mastcells
## Picking joint bandwidth of 0.0212
## Picking joint bandwidth of 0.0186
## Picking joint bandwidth of 0.014

RidgePlot(obj, features = c("THY1","COL1A1","COL11A1")) #Fibroblasts
## Picking joint bandwidth of 0.0288
## Picking joint bandwidth of 0.0489
## Picking joint bandwidth of 0.0078

RidgePlot(obj, features = c("MYL9","TPM1","TAGLN")) #Muscle
## Picking joint bandwidth of 0.0353
## Picking joint bandwidth of 0.0515
## Picking joint bandwidth of 0.028

RidgePlot(obj, features = c("SELE","VWF","PECAM1","CD200")) #Endothelial
## Picking joint bandwidth of 0.0161
## Picking joint bandwidth of 0.0188
## Picking joint bandwidth of 0.028
## Picking joint bandwidth of 0.0175

RidgePlot(obj, features = c("LYVE1","FLT4","VWF","PECAM1","CD200")) #Lymphatic
## Picking joint bandwidth of 0.00953
## Picking joint bandwidth of 0.00508
## Picking joint bandwidth of 0.0188
## Picking joint bandwidth of 0.028
## Picking joint bandwidth of 0.0175

RidgePlot(obj, features = c("MLANA", "MITF","SOX10", "KIT")) #Melanocytes
## Picking joint bandwidth of 0.0116
## Picking joint bandwidth of 0.0191
## Picking joint bandwidth of 0.00588
## Picking joint bandwidth of 0.0212

RidgePlot(obj, features = c("SOX10", "ITGB8")) #McSc
## Picking joint bandwidth of 0.00588
## Picking joint bandwidth of 0.0125

RidgePlot(obj, features = c("KLRK1")) #NKG2D+かどうかの確認
## Picking joint bandwidth of 2.1e-06

obj_tcell <- subset(obj, subset = Clusters %in% c("0","1","11")) #Tcells
obj_fib <- subset(obj, subset = Clusters %in% c("2","3","10")) #Fibroblasts
obj_mye <- subset(obj, subset = Clusters %in% c("4","12","22","23")) #Myeloid
obj_end <- subset(obj, subset = Clusters %in% c("5","16")) #Endothelial
obj_kera <- subset(obj, subset = Clusters %in% c("6","7","8","13","15","18")) #Keratinocytes
obj_mus <- subset(obj, subset = Clusters %in% c("9","14")) #Muscle
obj_mast <- subset(obj, subset = Clusters == "17") #Mastcells
obj_mela <- subset(obj, subset = Clusters == "20") #Melanocytes
obj_lym <- subset(obj, subset = Clusters == "21") #Lymphatic
obj_other <- subset(obj, subset = Clusters == "19") #Other
obj.meta <- obj@meta.data
obj.meta$clust_chat <- c("")
obj.meta[colnames(obj_tcell), ] $clust_chat <- "Tcells"
obj.meta[colnames(obj_fib), ] $clust_chat <- "Fibroblasts"
obj.meta[colnames(obj_mye), ] $clust_chat <- "Myeloid"
obj.meta[colnames(obj_end), ] $clust_chat <- "Endothelial"
obj.meta[colnames(obj_kera), ] $clust_chat <- "Keratinocytes"
obj.meta[colnames(obj_mus), ] $clust_chat <- "Muscle"
obj.meta[colnames(obj_mast), ] $clust_chat <- "Mastcells"
obj.meta[colnames(obj_mela), ] $clust_chat <- "Melanocytes"
obj.meta[colnames(obj_lym), ] $clust_chat <- "Lymphatic"
obj.meta[colnames(obj_other), ] $clust_chat <- "Other"
obj.meta.use <- c(colnames(obj_tcell),colnames(obj_fib),colnames(obj_mye),colnames(obj_end),colnames(obj_kera),colnames(obj_mus),colnames(obj_mast),colnames(obj_mela),colnames(obj_lym),colnames(obj_other))
obj <- obj[, obj.meta.use]
obj.meta <- obj.meta[obj.meta.use, ]
obj@meta.data <- obj.meta
saveRDS(obj, file = paste0(wd, "/scalp.rds"))